# Replication file for "Going the last Mile"

# Code necessary to replicate all tables and figures based on IPUMS data
# Table 1, 3; Figure 3, 5 in the main text; Appendix Table B.1, Figure B.1, Table B.2, Table B.3, Table B.4, Table C.1, Figure C.1, Table C.2, Table C.3, Table C.4

# create directory "../replication files/going the last mile" to save files and output

# R version 4.2.1
# install DemoTools package, see https://timriffe.github.io/DemoTools/
# install.packages("rstan", repos = c("https://mc-stan.org/r-packages/", getOption("repos")))
# remotes::install_github("timriffe/DemoTools")
# install.packages("Rcpp")
# install.packages("ipumsr")
# install.packages("tidyverse")

library(dplyr)
library(Rcpp)
library(DemoTools)
library(ipumsr)
library(ggplot2)
library(tidyr)


# Request census data from IPUMS International (https://international.ipums.org/international/)
# Select samples: Bolivia 1992 and 2001; Mexico 2000 and 2010
# Select harmonized variables: URBAN, ELECTRIC, WATSUP, SEWAGE, AGE, SEX, LIT, SPEAKIND

# This dataset contains census data for Mexico and Bolivia.
# Countries are selected by filtering based on the year of the census.
# Data for Mexico: 2000, 2010. Bolivia: 1992, 2001

# After obtaining and downloading file from IPUMS, adjust file name and directory as appropriate below. 
# ddi <- read_ipums_ddi("../ipumsi_XXXXX.xml")
ipums_data <- read_ipums_micro(ddi)
ipums_conditions()

# Prepare census data
census_data <- ipums_data |>
  filter(AGE > 20 & AGE < 100) |>
  mutate(
    # Dummy for heaped age (ending in 0 and 5)
    heaped = if_else(substr(AGE, 2, 2) %in% c("0", "5"),
      1, 0
    ),

    # Variable for "decade of life"
    decade = ifelse(AGE > 20 & AGE < 100,
      substr(AGE, 1, 1),
      NA
    ),
    decade_num = as.numeric(decade),

    # recode sex (female = 1)
    female = case_when(
      SEX == 2 ~ 1,
      SEX == 1 ~ 0,
      SEX == 9 ~ NA
    ),

    # recode illiterate (illiterate = 1)
    illiterate = case_when(
      LIT == 1 ~ 1,
      LIT == 2 ~ 0,
      LIT %in% c(0, 9) ~ NA
    ),

    # recode rural (rural = 1)
    rural = case_when(
      URBAN == 1 ~ 1,
      URBAN == 2 ~ 0,
      URBAN == 9 ~ NA
    ),

    # recode indigenous (speaks indigenous language = 1)
    indigenous = case_when(
      SPEAKIND > 0 & SPEAKIND < 4 ~ 1,
      SPEAKIND == 4 ~ 0,
      SPEAKIND %in% c(0, 8, 9) ~ NA
    ),

    # recode electricity (negative answer = 1, i.e. no electricity)
    no_electricity = case_when(
      ELECTRIC == 2 ~ 1,
      ELECTRIC == 1 ~ 0,
      ELECTRIC %in% c(0, 9) ~ NA
    ),

    # recode water (negative answer = 1, i.e. no water)
    no_water = case_when(
      WATSUP == 20 ~ 1,
      WATSUP > 9 & WATSUP < 20 ~ 0,
      WATSUP %in% c(0, 99) ~ NA
    ),

    # recode sewage (negative answer = 1, i.e. no connection to sewage)
    no_sewage = case_when(
      SEWAGE == 20 ~ 1,
      SEWAGE > 9 & SEWAGE < 20 ~ 0,
      SEWAGE %in% c(0, 99) ~ NA
    )
  )


# Mexico ------------------------------------------------------------------

# Define eligibility, using the first census (2000) as the base
ineligible_min <- 40
ineligible_max <- 59
eligible_min <- 64
eligible_max <- 83

census_data_Mex <- census_data |>
  filter(YEAR %in% c(2000, 2010)) |>
  mutate(
    age_consistent = if_else(YEAR == 2010, AGE - 10, AGE),
    decade_consistent = substr(age_consistent, 1, 1),
    decade_consistent_num = as.numeric(decade_consistent),
    # create treatment and eligibility indicator for DiD analysis
    # treatment: consistent age = 64-83; control: consistent age = 40-59
    pension = if_else(YEAR == 2010, 1, 0),
    eligible = case_when(
      age_consistent >= ineligible_min & age_consistent <= ineligible_max ~ 0,
      age_consistent >= eligible_min & age_consistent <= eligible_max ~ 1,
      TRUE ~ NA
    ),
    did = eligible * pension
  )

# Table 1
# 2000
model1 <- glm(
  heaped ~ illiterate + indigenous + female + decade_consistent_num + no_electricity + no_water + no_sewage + rural,
  family = binomial,
  data = subset(census_data_Mex, YEAR == 2000 & age_consistent > 39 & age_consistent < 80)
)
summary(model1)
exp(coef(model1))

# 2010
model2 <- glm(
  heaped ~ illiterate + indigenous + female + decade_consistent_num + no_electricity + no_water + no_sewage + rural,
  family = binomial,
  data = subset(census_data_Mex, YEAR == 2010 & age_consistent > 39 & age_consistent < 80)
)
summary(model2)
exp(coef(model2))

texreg::screenreg(list(model1, model2))
texreg::texreg(list(model1, model2), file = "replication files/going the last mile/table 1.tex")

# predicted probabilities
decade_2000 <- c("40s", "50s", "60s", "70s")
Mex2000 <- with(census_data_Mex, data.frame(decade_consistent_num = c(4:7), illiterate = 1, female = 1, no_electricity = 1, no_water = 1, no_sewage = 1, rural = 1, indigenous = 1))
Mex2010 <- with(census_data_Mex, data.frame(decade_consistent_num = c(4:7), illiterate = 1, female = 1, no_electricity = 1, no_water = 1, no_sewage = 1, rural = 1, indigenous = 1))

predict_2000 <- predict(model1, Mex2000, type = "response")
predict_2010 <- predict(model2, Mex2010, type = "response")

probabilities_mex <- data.frame(decade_2000, predict_2000, predict_2010)
probabilities_mex

# Plot Figure 3
# eligible 0 = "Cohort Age 40-59 in 2000, 50-60 in 2010", 1 = "Cohort Age 64-83 in 2000, 74-93 in 2010"
plot_data <- census_data_Mex |>
  filter(eligible == 1 | eligible == 0) |>
  mutate(pension = factor(pension, labels = c("2000", "2010"))) |>
  group_by(eligible, pension) |>
  summarize(
    mean_heaped = mean(heaped),
    se_heaped = sd(heaped) / sqrt(n()),
    upper = mean_heaped + (-1.96 * se_heaped),
    lower = mean_heaped + (1.96 * se_heaped)
  )

plot_data |>
  ggplot(aes(
    x = pension,
    y = mean_heaped,
    linetype = factor(
      eligible,
      levels = c(1, 0),
      labels = stringr::str_wrap(c(
        "Cohort Age 64-83 in 2000, 74-93 in 2010",
        "Cohort Age 40-59 in 2000, 50-69 in 2010"
      ), 20)
    )
  )) +
  geom_point() +
  geom_errorbar(
    aes(ymin = lower, ymax = upper),
    width = 0,
    linewidth = 1,
    linetype = 1
  ) +
  geom_line(aes(group = eligible), linewidth = 1) +
  scale_y_continuous(breaks = seq(0.2, 0.40, 0.05)) +
  coord_cartesian(ylim = c(0.2, 0.4)) +
  labs(y = "Proportion heaped", x = "Census year", linetype = "Cohort") +
  jtools::theme_apa()

ggsave("replication files/going the last mile/figure 3.png", width = 5, height = 3)

## Appendix
# Age heaping Table B.1
age_heaping_table_b1 <- census_data_Mex |>
  group_by(YEAR, eligible) |>
  summarise(age_n = n()) |>
  filter(!is.na(eligible)) |>
  pivot_wider(
    names_from = YEAR,
    values_from = age_n,
    names_prefix = "year"
  ) |>
  mutate(
    age_group = case_when(
      eligible == 0 ~ paste0(ineligible_min, " - ", ineligible_max),
      eligible == 1 ~ paste0(eligible_min, " - ", eligible_max)
    ),
    age_group_2010 = case_when(
      eligible == 0 ~ paste0(ineligible_min + 10, " - ", ineligible_max + 10),
      eligible == 1 ~ paste0(eligible_min + 10, " - ", eligible_max + 10)
    )
  ) |>
  select(age_group, year2000, age_group_2010, year2010)

age_heaping_table_b1
age_heaping_table_b1 |>
  kableExtra::kable(
    format = "latex",
    col.names = c("Age 2000", "Count 2000", "Age 2010", "Count 2010")
  ) |>
  kableExtra::save_kable("replication files/going the last mile/table b1.tex")

# heaping graphs, Figure B.1
ageheaping_all <- ipums_data |>
  filter(YEAR %in% c(2000, 2010), AGE != 999) |>
  group_by(YEAR, AGE) |>
  summarise(age_n = n())

# 1 = illiterate, 2 = literate
ageheaping_literate <- ipums_data |>
  filter(YEAR %in% c(2000, 2010), AGE != 999) |>
  group_by(YEAR, AGE, LIT) |>
  summarise(age_n = n())

# 1 = rural, 2 = urban
ageheaping_urban_rural <- ipums_data |>
  filter(YEAR %in% c(2000, 2010), AGE != 999) |>
  group_by(YEAR, AGE, URBAN) |>
  summarise(age_n = n())

# 2000
age2000 <- ageheaping_all[ageheaping_all$YEAR == 2000, ]$AGE

heaping2000_all <- ageheaping_all[ageheaping_all$YEAR == 2000, ]$age_n
heaping2000_illiterate <- ageheaping_literate[ageheaping_literate$YEAR == 2000 & ageheaping_literate$LIT == 1, ]$age_n
heaping2000_rural <- ageheaping_urban_rural[ageheaping_urban_rural$YEAR == 2000 & ageheaping_urban_rural$URBAN == 1, ]$age_n

# The variables illiterate and rural do not have complete data for older ages, so only analyze data available
v2000_all <- check_heaping_myers(heaping2000_all, age2000, eligible_min, eligible_max)
v2000_illiterate <- check_heaping_myers(heaping2000_illiterate, age2000[seq_along(heaping2000_illiterate)], eligible_min, eligible_max)
v2000_rural <- check_heaping_myers(heaping2000_rural, age2000[seq_along(heaping2000_rural)], eligible_min, eligible_max)

# 2010
age2010 <- ageheaping_all[ageheaping_all$YEAR == 2010, ]$AGE

heaping2010_all <- ageheaping_all[ageheaping_all$YEAR == 2010, ]$age_n
heaping2010_illiterate <- ageheaping_literate[ageheaping_literate$YEAR == 2010 & ageheaping_literate$LIT == 1, ]$age_n
heaping2010_rural <- ageheaping_urban_rural[ageheaping_urban_rural$YEAR == 2010 & ageheaping_urban_rural$URBAN == 1, ]$age_n

v2010_all <- check_heaping_myers(heaping2010_all, age2010, eligible_min + 10, eligible_max + 10)
v2010_illiterate <- check_heaping_myers(heaping2010_illiterate, age2010[seq_along(heaping2010_illiterate)], eligible_min + 10, eligible_max + 10)
v2010_rural <- check_heaping_myers(heaping2010_rural, age2010[seq_along(heaping2010_rural)], eligible_min + 10, eligible_max + 10)


cohort_names <- c("Mean", "Illiterate", "Rural")
trends2000_2010 <- data.frame(
  cohort = factor(rep(cohort_names, 2), levels = cohort_names[c(2, 3, 1)]),
  census = rep(c(2000, 2010), each = 3),
  myers_index = c(
    v2000_all, v2000_illiterate, v2000_rural,
    v2010_all, v2010_illiterate, v2010_rural
  )
)

trends2000_2010 |>
  ggplot(aes(x = census, y = myers_index, color = cohort)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(
    x = "Census year",
    y = "Myers Blended Index",
    colour = "Cohort"
  ) +
  scale_x_continuous(breaks = seq(2000, 2010, 10)) +
  scale_color_grey() +
  jtools::theme_apa()

ggsave("replication files/going the last mile/figure b1.png", width = 5, height = 3)


# Descriptives Table B.2, B.3
descriptives_mexico <- census_data_Mex |>
  filter(age_consistent > 39, age_consistent < 80) |>
  select(YEAR, illiterate, indigenous, female, decade_consistent_num, no_electricity, no_water, no_sewage, rural)

descriptives_mexico <- as.data.frame(descriptives_mexico)
variable_labels <- c("Illiterate", "Speaks Indigenous Language", "Female", "Decade of Life (Consistent)", "No electricity", "No water", "No sewage", "Rural")

stargazer::stargazer(subset(descriptives_mexico, YEAR == 2000, select = -1), title = "Descriptive statistics Mexico 2000", covariate.labels = variable_labels, out = "replication files/going the last mile/table b2.tex")
stargazer::stargazer(subset(descriptives_mexico, YEAR == 2010, select = -1), title = "Descriptive statistics Mexico 2010", covariate.labels = variable_labels, out = "replication files/going the last mile/table b3.tex")

# DID analysis Table B.4
# GLM model
model5 <- glm(heaped ~ eligible + pension + did, data = census_data_Mex, family = binomial)
summary(model5)
exp(coef(model5))

Mex_did <- with(census_data_Mex, data.frame(eligible = c(0, 1), pension = 1, did = 1))
predict(model5, Mex_did, type = "response")

# Linear model
model6 <- lm(heaped ~ eligible + pension + did, data = census_data_Mex)
summary(model6)

texreg::screenreg(list(model5, model6))
texreg::texreg(list(model5, model6), file = "replication files/going the last mile/table b4.tex")




# Bolivia -----------------------------------------------------------------

# Define the eligibility, using the first census (1992) as the base
ineligible_min <- 40
ineligible_max <- 59
eligible_min <- 60
eligible_max <- 79

census_data_Bol <- census_data |>
  filter(YEAR %in% c(1992, 2001)) |>
  mutate(
    age_consistent = if_else(YEAR == 2001, AGE - 9, AGE),
    decade_consistent = substr(age_consistent, 1, 1),
    decade_consistent_num = as.numeric(decade_consistent),
    # create treatment and eligibility indicator for DiD analysis
    # treatment: consistent age = 40-59; control: consistent age = 60-79
    pension = if_else(YEAR == 2001, 1, 0),
    eligible = case_when(
      age_consistent >= ineligible_min & age_consistent <= ineligible_max ~ 0,
      age_consistent >= eligible_min & age_consistent <= eligible_max ~ 1
    ),
    did = eligible * pension
  )


# Models Table 3
# 1992
model3 <- glm(heaped ~ illiterate + female + decade_consistent_num + no_electricity + no_water + no_sewage + rural, family = binomial, data = subset(census_data_Bol, YEAR == 1992 & age_consistent > 39 & age_consistent < 80))
summary(model3)

# 2001
model4 <- glm(heaped ~ illiterate + female + decade_consistent_num + no_electricity + no_water + no_sewage + rural, family = binomial, data = subset(census_data_Bol, YEAR == 2001 & age_consistent > 39 & age_consistent < 80))
summary(model4)

# 2001
model4a <- glm(heaped ~ illiterate + female + decade_consistent_num + no_electricity + no_water + no_sewage + rural + indigenous, family = binomial, data = subset(census_data_Bol, YEAR == 2001 & age_consistent > 39 & age_consistent < 80))
summary(model4a)

texreg::screenreg(list(model3, model4, model4a))
texreg::texreg(list(model3, model4, model4a), file = "replication files/going the last mile/table 3.tex")

# Predicted probabilities
decade_1992 <- c("40s", "50s", "60s", "70s")
Bol1992 <- with(census_data_Bol, data.frame(decade_consistent_num = c(4:7), illiterate = 1, female = 1, no_electricity = 0, no_water = 0, no_sewage = 0, rural = 1))
Bol2001 <- with(census_data_Bol, data.frame(decade_consistent_num = c(4:7), illiterate = 1, female = 1, no_electricity = 0, no_water = 0, no_sewage = 0, rural = 1))

predict_1992 <- predict(model3, Bol1992, type = "response")
predict_2001 <- predict(model4, Bol2001, type = "response")

probabilities_Bol <- data.frame(decade_1992, predict_1992, predict_2001)
probabilities_Bol


# Plot Figure 5
plot_data <- census_data_Bol |>
  filter(eligible == 1 | eligible == 0) |>
  mutate(pension = factor(pension, labels = c("1992", "2001"))) |>
  group_by(eligible, pension) |>
  summarize(
    mean_heaped = mean(heaped),
    se_heaped = sd(heaped) / sqrt(n()),
    upper = mean_heaped + (-1.96 * se_heaped),
    lower = mean_heaped + (1.96 * se_heaped)
  )

plot_data |>
  ggplot(aes(
    x = pension,
    y = mean_heaped,
    linetype = factor(
      eligible,
      levels = c(1, 0),
      labels = stringr::str_wrap(
        c(
          "Cohort Age 60-79 in 1992, 69-88 in 2001",
          "Cohort Age 40-59 in 1992, 49-68 in 2001"
        ), 20
      )
    )
  )) +
  geom_point() +
  geom_errorbar(
    aes(ymin = lower, ymax = upper),
    width = 0,
    linewidth = 1,
    linetype = 1
  ) +
  geom_line(aes(group = eligible), linewidth = 1) +
  scale_y_continuous(breaks = seq(0.2, 0.40, 0.05)) +
  coord_cartesian(ylim = c(0.2, 0.4)) +
  labs(y = "Proportion heaped", x = "Census year", linetype = "Cohort") +
  jtools::theme_apa()

ggsave("replication files/going the last mile/figure 5.png", width = 5, height = 3)

## Appendix

# Table C.1
age_heaping_table_c1 <-
  census_data_Bol |>
  group_by(YEAR, eligible) |>
  summarise(age_n = n()) |>
  filter(!is.na(eligible)) |>
  pivot_wider(
    names_from = YEAR,
    values_from = age_n,
    names_prefix = "year"
  ) |>
  mutate(
    age_group = case_when(
      eligible == 0 ~ paste0(ineligible_min, " - ", ineligible_max),
      eligible == 1 ~ paste0(eligible_min, " - ", eligible_max)
    ),
    age_group_2001 = case_when(
      eligible == 0 ~ paste0(ineligible_min + 9, " - ", ineligible_max + 9),
      eligible == 1 ~ paste0(eligible_min + 9, " - ", eligible_max + 9)
    )
  ) |>
  select(age_group, year1992, age_group_2001, year2001)

age_heaping_table_c1
age_heaping_table_c1 |>
  kableExtra::kable(
    format = "latex",
    col.names = c("Age 1992", "Count 1992", "Age 2001", "Count 2001")
  ) |>
  kableExtra::save_kable("replication files/going the last mile/table c1.tex")

# heaping graphs, Figure C.1
ageheaping_all <- ipums_data |>
  filter(YEAR %in% c(1992, 2001), AGE != 999) |>
  group_by(YEAR, AGE) |>
  summarise(age_n = n())

# 1 = illiterate, 2 = literate
ageheaping_literate <- ipums_data |>
  filter(YEAR %in% c(1992, 2001), AGE != 999) |>
  group_by(YEAR, AGE, LIT) |>
  summarise(age_n = n())

# 1 = rural, 2 = urban
ageheaping_urban_rural <- ipums_data |>
  filter(YEAR %in% c(1992, 2001), AGE != 999) |>
  group_by(YEAR, AGE, URBAN) |>
  summarise(age_n = n())

# 1992
age1992 <- ageheaping_all[ageheaping_all$YEAR == 1992, ]$AGE

heaping1992_all <- ageheaping_all[ageheaping_all$YEAR == 1992, ]$age_n
heaping1992_illiterate <- ageheaping_literate[ageheaping_literate$YEAR == 1992 & ageheaping_literate$LIT == 1, ]$age_n
heaping1992_rural <- ageheaping_urban_rural[ageheaping_urban_rural$YEAR == 1992 & ageheaping_urban_rural$URBAN == 1, ]$age_n

# Variables illiterate and rural data do not have complete data for older ages, so only analyse data available
v1992_all <- check_heaping_myers(heaping1992_all, age1992, eligible_min, eligible_max)
v1992_illiterate <- check_heaping_myers(heaping1992_illiterate, age1992[seq_along(heaping1992_illiterate)], eligible_min, eligible_max)
v1992_rural <- check_heaping_myers(heaping1992_rural, age1992[seq_along(heaping1992_rural)], eligible_min, eligible_max)

# 2001
age2001 <- ageheaping_all[ageheaping_all$YEAR == 2001, ]$AGE

heaping2001_all <- ageheaping_all[ageheaping_all$YEAR == 2001, ]$age_n
heaping2001_illiterate <- ageheaping_literate[ageheaping_literate$YEAR == 2001 & ageheaping_literate$LIT == 1, ]$age_n
heaping2001_rural <- ageheaping_urban_rural[ageheaping_urban_rural$YEAR == 2001 & ageheaping_urban_rural$URBAN == 1, ]$age_n


v2001_all <- check_heaping_myers(heaping2001_all, age2001, eligible_min + 9, eligible_max + 9)
v2001_illiterate <- check_heaping_myers(heaping2001_illiterate, age2001[seq_along(heaping2001_illiterate)], eligible_min + 9, eligible_max + 9)
v2001_rural <- check_heaping_myers(heaping2001_rural, age2001[seq_along(heaping2001_rural)], eligible_min + 9, eligible_max + 9)


cohort_names <- c("Mean", "Illiterate", "Rural")
trends1992_2001 <- data.frame(
  cohort = factor(rep(cohort_names, 2), levels = cohort_names[c(2, 3, 1)]),
  census = rep(c(1992, 2001), each = 3),
  myers_index = c(
    v1992_all, v1992_illiterate, v1992_rural,
    v2001_all, v2001_illiterate, v2001_rural
  )
)

trends1992_2001 |>
  ggplot(aes(x = census, y = myers_index, color = cohort)) +
  geom_line(linewidth = 1) +
  geom_point(size = 2) +
  labs(
    x = "Census year",
    y = "Myers Blended Index",
    colour = "Cohort"
  ) +
  scale_x_continuous(breaks = seq(1992, 2001, 9)) +
  scale_color_grey() +
  jtools::theme_apa()

ggsave("replication files/going the last mile/figure c1.png", width = 5, height = 3)



# Descriptives Table C.2, C.3
# Bolivia 1992 does not have data on indigenous language
descriptives_bolivia <- census_data_Bol |>
  filter(age_consistent > 39, age_consistent < 80) |>
  select(YEAR, illiterate, indigenous, female, decade_consistent_num, no_electricity, no_water, no_sewage, rural)

descriptives_bolivia <- as.data.frame(descriptives_bolivia)
variable_labels <- c("Illiterate", "Female", "Decade of Life (Consistent)", "No electricity", "No water", "No sewage", "Rural")
stargazer::stargazer(subset(descriptives_bolivia, YEAR == 1992, select = -1), title = "Descriptive statistics Bolivia 1992", covariate.labels = variable_labels, out = "replication files/going the last mile/table c2.tex")

variable_labels <- c("Illiterate", "Speaks  IndigenousLanguage", "Female", "Decade of Life (Consistent)", "No electricity", "No water", "No sewage", "Rural")
stargazer::stargazer(subset(descriptives_bolivia, YEAR == 2001, select = -1), title = "Descriptive statistics Bolivia 2001", covariate.labels = variable_labels, out = "replication files/going the last mile/table c3.tex")

# Table C.4
# Logit model
model7 <- glm(heaped ~ eligible + pension + did, data = census_data_Bol, family = binomial)
summary(model7)

# Linear model
model8 <- lm(heaped ~ eligible + pension + did, data = census_data_Bol)
summary(model8)

texreg::screenreg(list(model7, model8))
texreg::texreg(list(model7, model8), file = "replication files/going the last mile/table c4.tex")
